Install a package countrycode first.
library(tidyverse)
library(WDI)
library(readxl)
library(countrycode)
World Inequality Report 2022: https://wir2022.wid.world
Methodology [Link]
Since Excel files are binary file, you need to add mode = “wb”, web binary.
url_summary <- "https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx"
download.file(url = url_summary, destfile = "data/WIR2022s.xlsx", mode = "wb")
trying URL 'https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx'
Content type 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' length 908659 bytes (887 KB)
==================================================
downloaded 887 KB
library(readxl) # readxl is a part of tidyverse but not a core package
excel_sheets("data/WIR2022s.xlsx")
[1] "Index" "F1" "F2" "F3" "F4" "F5." "F6"
[8] "F7" "F8" "F9" "F10" "F11" "F12" "F13"
[15] "F14" "F15" "T1" "data-F1" "data-F2" "data-F3" "data-F4"
[22] "data-F5" "data-F6" "data-F7" "data-F8" "data-F9" "data-F10" "data-F11"
[29] "data-F12" "data-F13." "data-F14." "data-F15"
df_wir_f2 <- read_excel("data/WIR2022s.xlsx",
sheet = "data-F2")
df_wir_f2
Created a new Excel book with only one sheet, and save it as a CSV UTF-8.
df_wir_f2_2 <- read_csv("data/wir-f2.csv")
Rows: 8 Columns: 5── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): iso, Bottom 50%, Middle 40%, Top 10%
dbl (1): year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_wir_f2_2
df_wir_f2_2 |> identical(df_wir_f2)
[1] FALSE
df_wir_f2_3 <- read_delim(clipboard())
df_wir_f2_3
df_wir_f2 |> pivot_longer(3:5) |>
ggplot(aes(iso, value, fill = name)) + geom_col(position = "dodge") + coord_flip()
Popular statistical tables
Explorer - datamarts: http://data.un.org/Explorer.aspx
Copy the link of International Migrants and Refugees
un_migrants_url <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"
download.file(un_migrants_url, destfile = "data/migrants.csv")
trying URL 'https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv'
Content type 'application/octet-stream' length 1634042 bytes (1.6 MB)
==================================================
downloaded 1.6 MB
df_un_migrants <- read_csv("data/migrants.csv")
New names:Rows: 7239 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): T04, International migrants and refugees, ...3, ...4, ...5, ...6, ...7
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_migrants
df_un_migrants <- read_csv("data/migrants.csv", skip=1)
New names:Rows: 7238 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ...2, Series, Footnotes, Source
dbl (2): Region/Country/Area, Year
num (1): Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_migrants
url_un_migrants <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"
Since the url is given, a CSV file can be read directly using
read_csv.
df_un_migrants <- read_csv(url_un_migrants, skip = 1)
New names:Rows: 7238 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ...2, Series, Footnotes, Source
dbl (2): Region/Country/Area, Year
num (1): Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_migrants
str(df_un_migrants)
spc_tbl_ [7,238 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Region/Country/Area: num [1:7238] 1 1 1 1 1 1 1 1 1 1 ...
$ ...2 : chr [1:7238] "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" ...
$ Year : num [1:7238] 2005 2005 2005 2005 2010 ...
$ Series : chr [1:7238] "International migrant stock: Both sexes (number)" "International migrant stock: Both sexes (% total population)" "International migrant stock: Male (% total Population)" "International migrant stock: Female (% total Population)" ...
$ Value : num [1:7238] 1.91e+08 2.90 3.00 2.90 2.21e+08 ...
$ Footnotes : chr [1:7238] NA NA NA NA ...
$ Source : chr [1:7238] "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." ...
- attr(*, "spec")=
.. cols(
.. `Region/Country/Area` = col_double(),
.. ...2 = col_character(),
.. Year = col_double(),
.. Series = col_character(),
.. Value = col_number(),
.. Footnotes = col_character(),
.. Source = col_character()
.. )
- attr(*, "problems")=<externalptr>
df_un_migrants |> summary()
Region/Country/Area ...2 Year Series
Min. : 1.0 Length:7238 Min. :2005 Length:7238
1st Qu.:178.0 Class :character 1st Qu.:2010 Class :character
Median :404.0 Mode :character Median :2015 Mode :character
Mean :407.1 Mean :2014
3rd Qu.:630.0 3rd Qu.:2020
Max. :894.0 Max. :2022
Value Footnotes Source
Min. : 0 Length:7238 Length:7238
1st Qu.: 5 Class :character Class :character
Median : 154 Mode :character Mode :character
Mean : 731059
3rd Qu.: 31096
Max. :280598105
df_un_migrants |> select(Year,Series) |> lapply(unique)
$Year
[1] 2005 2010 2015 2020 2022 2019 2018 2016 2021 2017
$Series
[1] "International migrant stock: Both sexes (number)"
[2] "International migrant stock: Both sexes (% total population)"
[3] "International migrant stock: Male (% total Population)"
[4] "International migrant stock: Female (% total Population)"
[5] "Total refugees and people in refugee-like situations (number)"
[6] "Asylum seekers, including pending cases (number)"
[7] "Other of concern to UNHCR (number)"
[8] "Total population of concern to UNHCR (number)"
df_un_migrants |> distinct(`Region/Country/Area`, ...2)
library(countrycode)
df_un_migrants %>% mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants |> mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |> filter(is.na(iso2c_un)) |> distinct(...2)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
wdicache <- read_rds("data/wdicache.rds")
wdi_country_extra <- wdicache$country |> select(iso2c, region, income, lending)
df_un_migrants_ext <- df_un_migrants %>%
mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |>
drop_na(iso2c_un) |>
select(Country = ...2, ISO2C = iso2c_un, Year, Series, Value, Footnotes) |>
left_join(wdi_country_extra, by = c("ISO2C" = "iso2c"))
df_un_migrants_ext
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants_ext$Series |> unique()
[1] "International migrant stock: Both sexes (number)"
[2] "International migrant stock: Both sexes (% total population)"
[3] "International migrant stock: Male (% total Population)"
[4] "International migrant stock: Female (% total Population)"
[5] "Total refugees and people in refugee-like situations (number)"
[6] "Asylum seekers, including pending cases (number)"
[7] "Other of concern to UNHCR (number)"
[8] "Total population of concern to UNHCR (number)"
df_un_migrants_ext_rev <- df_un_migrants_ext |> mutate(Ser = case_when(
Series == "International migrant stock: Both sexes (number)" ~ "migrant",
Series == "International migrant stock: Both sexes (% total population)" ~ "migrant_percent",
Series == "International migrant stock: Male (% total Population)" ~ "migrant_male",
Series == "International migrant stock: Female (% total Population)" ~ "migrant_female",
Series == "Total refugees and people in refugee-like situations (number)" ~ "refugee",
Series == "Asylum seekers, including pending cases (number)" ~ "asylum",
Series == "Other of concern to UNHCR (number)" ~ "other",
Series == "Total population of concern to UNHCR (number)" ~ "concern",
TRUE ~ Series), .before = Series)
df_un_migrants_ext_rev
dput(df_un_migrants_ext_rev)
Environment Statistics Database>Water>Internal flow [Link]
Select columns to add Table ID and Country and Area Code of UN, and download ‘Comma’ a CSV.
df_un_water <- read_csv("data/UNdata_Export_20240124_010609671.csv")
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 1604 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Country or Area Code, Country or Area, Unit
dbl (4): Table ID, Year, Value, Value Footnotes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_water
OECD data https://data.oecd.org/
There is a newly developed Database Access linked above. However, it is still under development and difficult to handle data there.
Permanent immigrant inflows Total, Number, 2022 Link
Definition of Permanent immigrant inflows
library(htmltools)
iframe_code <- '<iframe src="https://data.oecd.org/chart/7kgk" width="860" height="645" style="border: 0" mozallowfullscreen="true" webkitallowfullscreen="true" allowfullscreen="true"><a href="https://data.oecd.org/chart/7kgk" target="_blank">OECD Chart: Permanent immigrant inflows, Total, Number, Annual, 2022</a></iframe>'
HTML(iframe_code)
Data Site: https://data.oecd.org/migration/permanent-immigrant-inflows.htm
Definition of Permanent immigrant inflows: Permanent immigrant inflows cover regulated movements of foreigners considered to be settling in the country from the perspective of the destination country. They cover regulated movements of foreigners as well as free movement migration. The data presented are the result of a standardisation process that allows for cross-country comparisons. This indicator is measured by numbers of permanent inflows.
Citation: OECD (2024), Permanent immigrant inflows (indicator). doi: 10.1787/304546b6-en (Accessed on 28 January 2024)
Categories: Total, Work, Free movements, Family, Family accompanying workers, Humanitarian, Other
library(tidyverse)
── Attaching core tidyverse packages ───────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2 ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
Read the downloaded full data of ‘Permanent Immigrant Inflows’.
df_inflows <- read_csv("data/DP_LIVE_28012024004117279.csv")
Rows: 3597 Columns: 8── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): LOCATION, INDICATOR, SUBJECT, MEASURE, FREQUENCY
dbl (2): TIME, Value
lgl (1): Flag Codes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_inflows
df_inflows |> select(-Value) |> lapply(unique)
$LOCATION
[1] "AUS" "AUT" "BEL" "CAN" "DNK" "FIN" "FRA" "DEU" "IRL" "ITA" "JPN" "KOR" "NLD" "NZL"
[15] "NOR" "PRT" "ESP" "SWE" "CHE" "GBR" "USA" "ISR" "RUS" "MEX" "CZE" "LUX" "POL" "EST"
$INDICATOR
[1] "IMMIGINFLOW"
$SUBJECT
[1] "FAM" "FAMWORKR" "FREEMOVS" "HUMNTRN" "OTH" "WORK" "TOT"
$MEASURE
[1] "NBR"
$FREQUENCY
[1] "A"
$TIME
[1] 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
[18] 2020 2021 2022 2000 2001 2002 1995 1996 1997 1998 1999
$`Flag Codes`
[1] NA
Set the order of categories to display.
CAT <- c("TOT", "WORK", "FREEMOVS", "FAM", "FAMWORKR", "HUMNTRN", "OTH")
Add country names using the countrycode package and
delete INDICATOR, MEASURE, FREQUENCY, Flag Codes.
df_in <- df_inflows |>
mutate(country = countrycode(LOCATION, "iso3c", "country.name"), .before = LOCATION) |>
select(country, iso3c = LOCATION, category = SUBJECT, year = TIME, value = Value)
df_in
df_in |> filter(country == "Japan") |>
ggplot(aes(year, value, col = factor(category, levels = CAT))) + geom_line() + labs(col = "Categories")
df_in |> filter(country == "Japan") |> filter(category != "TOT") |>
ggplot(aes(year, value, fill = factor(category, levels = rev(CAT)))) +
geom_area(col = "black", linewidth = 0.1) +
labs(title = "Permanent immigrant inflows of Japan", fill = "Categories")
df_in |> filter(category != "TOT") |>
ggplot(aes(year, value, fill = factor(category, levels = rev(CAT)))) + geom_area(col = "black", linewidth = 0.1) +
facet_wrap(~country) +
labs(title = "Permanent immigrant inflows of 28 Countries", fill = "") +
theme(legend.position = 'bottom')
df_in |> filter(country %in% c("Germany", "United States")) |>
ggplot(aes(year, value, col = country, linetype = factor(category, levels = CAT))) + geom_line() +
labs(title = "Permanent immigrant inflows of Germany and United States", linetype = "Categories")
tot_lev <- df_in |> pivot_wider(names_from = category, values_from = value) |>
pivot_longer(cols = CAT[CAT!="TOT"], names_to = "category", values_to = "value") |> filter(year == 2022) |> distinct(country, iso3c, TOT) |> arrange(desc(TOT)) |> pull(country)
df_in |> pivot_wider(names_from = category, values_from = value) |>
pivot_longer(cols = CAT[CAT!="TOT"], names_to = "category", values_to = "value") |> filter(year == 2022) |> ggplot(aes(factor(country, levels = rev(tot_lev)), value, fill = factor(category, levels = CAT))) + geom_col(col = "black", linewidth = 0.1) + coord_flip() +
labs(title = "Permanent immigrant inflows in 2022", fill = "", x = "") +
theme(legend.position = 'bottom')
Data Site: https://data.oecd.org/migration/foreign-population.htm
Definition of Foreign population: The foreign population consists of people who still have the nationality of their home country. It may include people born in the host country. The difference across countries between the size of the foreign-born population and that of the foreign population depends on the rules governing the acquisition of citizenship in each country. This indicator is measured as a percentage of population.
Citation: OECD (2024), Foreign population (indicator). doi: 10.1787/16a914e3-en (Accessed on 28 January 2024)
Categories: Total, Percent of Population
Read the downloaded full data of ‘Foreign population’.
df_foreign <- read_csv("data/DP_LIVE_28012024004200124.csv")
Rows: 545 Columns: 8── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): LOCATION, INDICATOR, SUBJECT, MEASURE, FREQUENCY
dbl (2): TIME, Value
lgl (1): Flag Codes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_foreign
df_foreign |> select(-Value) |> lapply(unique)
$LOCATION
[1] "AUT" "BEL" "DNK" "FIN" "DEU" "HUN" "IRL" "ITA" "JPN" "KOR" "LUX" "NLD" "NOR" "PRT"
[15] "SVK" "SWE" "CHE" "GBR" "USA" "SVN" "CZE" "FRA" "POL" "ESP" "GRC" "EST" "ISL" "CAN"
[29] "MEX" "TUR" "CHL" "LVA" "LTU"
$INDICATOR
[1] "FPOP"
$SUBJECT
[1] "TOT"
$MEASURE
[1] "PC_POP"
$FREQUENCY
[1] "A"
$TIME
[1] 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
[18] 2017 2018 2019
$`Flag Codes`
[1] NA
Add country names using the countrycode package and
delete INDICATOR, MEASURE, FREQUENCY, Flag Codes.
df_fpop <- df_foreign |>
mutate(country = countrycode(LOCATION, "iso3c", "country.name"), .before = LOCATION) |>
select(country, iso3c = LOCATION, year = TIME, foreign = Value)
df_fpop
df_fpop |> filter(country == "Japan") |>
ggplot(aes(year, foreign)) + geom_line() +
labs(title = "Foreign Population in Japan (Percent)")
df_fpop |>
ggplot(aes(year, foreign)) + geom_line(aes(col = iso3c)) +
geom_smooth(formula = 'y~x', method = "loess", se = FALSE) +
labs(title = "Total Foreign Population (%)")
df_fpop |> filter(year == 2019) |>
ggplot(aes(fct_reorder(country, foreign), foreign)) + geom_col() +
coord_flip() + labs(title = "Foreign Population (%)", x = "", y = "")
Data Site: https://data.oecd.org/pop/population.htm
Definition of Population: Population is defined as all nationals present in, or temporarily absent from a country, and aliens permanently settled in a country. This indicator shows the number of people that usually live in an area. Growth rates are the annual changes in population resulting from births, deaths and net migration during the year. Total population includes the following: national armed forces stationed abroad; merchant seamen at sea; diplomatic personnel located abroad; civilian aliens resident in the country; displaced persons resident in the country. However, it excludes the following: foreign armed forces stationed in the country; foreign diplomatic personnel located in the country; civilian aliens temporarily in the country. Population projections are a common demographic tool. They provide a basis for other statistical projections, helping governments in their decision making. This indicator is measured in terms of annual growth rate and in thousands of people.
Citation: OECD (2024), Population (indicator). doi: 10.1787/d434f82b-en (Accessed on 28 January 2024)
df_oecd_pop <- read_csv("data/DP_LIVE_28012024064002957.csv")
Rows: 12264 Columns: 8── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): LOCATION, INDICATOR, SUBJECT, MEASURE, FREQUENCY
dbl (2): TIME, Value
lgl (1): Flag Codes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_oecd_pop
df_oecd_pop |> select(-Value) |> lapply(unique)
$LOCATION
[1] "AUS" "AUT" "BEL" "CAN" "CZE" "DNK" "FIN" "FRA" "DEU" "GRC" "HUN" "ISL"
[13] "IRL" "ITA" "JPN" "KOR" "LUX" "MEX" "NLD" "NZL" "NOR" "POL" "PRT" "SVK"
[25] "ESP" "SWE" "CHE" "TUR" "GBR" "USA" "BRA" "CHL" "COL" "EST" "ISR" "RUS"
[37] "SVN" "CHN" "IND" "IDN" "ZAF" "OECD" "LVA" "ARG" "BGR" "CRI" "HRV" "CYP"
[49] "LTU" "MLT" "ROU" "SAU" "SGP" "WLD" "G20" "EU27"
$INDICATOR
[1] "POP"
$SUBJECT
[1] "MEN" "TOT" "WOMEN"
$MEASURE
[1] "MLN_PER"
$FREQUENCY
[1] "A"
$TIME
[1] 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966
[18] 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983
[35] 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
[52] 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
[69] 2018 2019 2020 2021 2022
$`Flag Codes`
[1] NA
df_pop_short <- df_oecd_pop |> filter(SUBJECT == "TOT") |> select(iso3c = LOCATION, year = TIME, pop = Value)
Check the differences of countries in each datasets.
setdiff(df_fpop$iso3c, df_in$iso3c); setdiff(df_in$iso3c,df_fpop$iso3c)
[1] "HUN" "SVK" "SVN" "GRC" "ISL" "TUR" "CHL" "LVA" "LTU"
[1] "AUS" "NZL" "ISR" "RUS"
setdiff(df_fpop$iso3c, df_pop_short$iso3c); setdiff(df_in$iso3c, df_pop_short$iso3c)
character(0)
character(0)
df_fpop but not in df_in.df_in but not in
df_fpop.df_pop_short contains the data of countries in
df_in and df_fpop.df_fpop2 <- df_fpop |> select(iso3c, year, foreign)
df_oecd_fpop <- df_in |> full_join(df_fpop2, by = c("iso3c", "year")) |>
left_join(df_pop_short, by = c("iso3c", "year")) |>
mutate(foreign_pop = round(pop*10000*foreign), .after = value)
df_oecd_fpop
df_oecd_fpop |> filter(category == "TOT") |> drop_na(value, foreign) |>
filter(value >0, foreign >0) |>
ggplot(aes(value, foreign_pop)) + geom_point(aes(col = iso3c)) + scale_x_log10() + scale_y_log10() + geom_smooth(formula = 'y~x', method = "lm", se = FALSE)
Popular statistical tables
Explorer - datamarts: http://data.un.org/Explorer.aspx
Copy the link of International Migrants and Refugees
un_migrants_url <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"
download.file(un_migrants_url, destfile = "data/migrants.csv")
trying URL 'https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv'
Content type 'application/octet-stream' length 1634042 bytes (1.6 MB)
==================================================
downloaded 1.6 MB
df_un_migrants <- read_csv("data/migrants.csv")
New names:Rows: 7239 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): T04, International migrants and refugees, ...3, ...4, ...5, ...6, ...7
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_migrants
df_un_migrants <- read_csv("data/migrants.csv", skip=1)
New names:Rows: 7238 Columns: 7── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ...2, Series, Footnotes, Source
dbl (2): Region/Country/Area, Year
num (1): Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_migrants
str(df_un_migrants)
spc_tbl_ [7,238 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ Region/Country/Area: num [1:7238] 1 1 1 1 1 1 1 1 1 1 ...
$ ...2 : chr [1:7238] "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" ...
$ Year : num [1:7238] 2005 2005 2005 2005 2010 ...
$ Series : chr [1:7238] "International migrant stock: Both sexes (number)" "International migrant stock: Both sexes (% total population)" "International migrant stock: Male (% total Population)" "International migrant stock: Female (% total Population)" ...
$ Value : num [1:7238] 1.91e+08 2.90 3.00 2.90 2.21e+08 ...
$ Footnotes : chr [1:7238] NA NA NA NA ...
$ Source : chr [1:7238] "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." ...
- attr(*, "spec")=
.. cols(
.. `Region/Country/Area` = col_double(),
.. ...2 = col_character(),
.. Year = col_double(),
.. Series = col_character(),
.. Value = col_number(),
.. Footnotes = col_character(),
.. Source = col_character()
.. )
- attr(*, "problems")=<externalptr>
df_un_migrants |> summary()
Region/Country/Area ...2 Year Series
Min. : 1.0 Length:7238 Min. :2005 Length:7238
1st Qu.:178.0 Class :character 1st Qu.:2010 Class :character
Median :404.0 Mode :character Median :2015 Mode :character
Mean :407.1 Mean :2014
3rd Qu.:630.0 3rd Qu.:2020
Max. :894.0 Max. :2022
Value Footnotes Source
Min. : 0 Length:7238 Length:7238
1st Qu.: 5 Class :character Class :character
Median : 154 Mode :character Mode :character
Mean : 731059
3rd Qu.: 31096
Max. :280598105
df_un_migrants |> select(Year,Series) |> lapply(unique)
$Year
[1] 2005 2010 2015 2020 2022 2019 2018 2016 2021 2017
$Series
[1] "International migrant stock: Both sexes (number)"
[2] "International migrant stock: Both sexes (% total population)"
[3] "International migrant stock: Male (% total Population)"
[4] "International migrant stock: Female (% total Population)"
[5] "Total refugees and people in refugee-like situations (number)"
[6] "Asylum seekers, including pending cases (number)"
[7] "Other of concern to UNHCR (number)"
[8] "Total population of concern to UNHCR (number)"
df_un_migrants |> distinct(`Region/Country/Area`, ...2)
library(countrycode)
df_un_migrants %>% mutate(iso3c = countrycode(`Region/Country/Area`, "un", "iso3c"), .after = ...2)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = countrycode(`Region/Country/Area`, "un", "iso3c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants |> mutate(iso3c = countrycode(`Region/Country/Area`, "un", "iso3c"), .after = ...2) |> filter(is.na(iso3c)) |> distinct(...2)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = countrycode(`Region/Country/Area`, "un", "iso3c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
wdicache <- read_rds("data/wdicache.rds")
wdi_country_extra <- wdicache$country |> select(iso3c, region, income, lending)
df_un_migrants_ext <- df_un_migrants |>
mutate(iso3c = countrycode(`Region/Country/Area`, "un", "iso3c"), .after = ...2) |>
drop_na(iso3c) |>
select(country = ...2, iso3c, year = Year, series = Series, value = Value, footnotes = Footnotes) |>
left_join(wdi_country_extra, by = "iso3c")
df_un_migrants_ext
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `iso3c = countrycode(`Region/Country/Area`, "un", "iso3c")`.
Caused by warning:
! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants_ext$Series |> unique()
[1] "International migrant stock: Both sexes (number)"
[2] "International migrant stock: Both sexes (% total population)"
[3] "International migrant stock: Male (% total Population)"
[4] "International migrant stock: Female (% total Population)"
[5] "Total refugees and people in refugee-like situations (number)"
[6] "Asylum seekers, including pending cases (number)"
[7] "Other of concern to UNHCR (number)"
[8] "Total population of concern to UNHCR (number)"
df_un_migrants_ext_rev <- df_un_migrants_ext |> mutate(ser = case_when(
Series == "International migrant stock: Both sexes (number)" ~ "migrant",
Series == "International migrant stock: Both sexes (% total population)" ~ "migrant_percent",
Series == "International migrant stock: Male (% total Population)" ~ "migrant_male",
Series == "International migrant stock: Female (% total Population)" ~ "migrant_female",
Series == "Total refugees and people in refugee-like situations (number)" ~ "refugee",
Series == "Asylum seekers, including pending cases (number)" ~ "asylum",
Series == "Other of concern to UNHCR (number)" ~ "other",
Series == "Total population of concern to UNHCR (number)" ~ "concern",
TRUE ~ Series), .before = Series)
df_un_migrants_ext_rev
df_oecd_un <- df_oecd_fpop |> left_join(df_un_migrants_ext_rev, by = c("country", "iso3c", "year")) |>
select(country, iso3c, category, year, value, foreign_pop, foreign, pop, ser, migrants)
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
df_oecd_un
df_oecd_un |> filter(country == "Japan")
df_oecd_un_wide <- df_oecd_un |> drop_na(value, migrants) |> pivot_wider(names_from = category, values_from = value) |>
pivot_wider(names_from = ser, values_from = migrants)
df_oecd_un_wide
df_oecd_un_wide |> str()
tibble [117 × 21] (S3: tbl_df/tbl/data.frame)
$ country : chr [1:117] "Australia" "Australia" "Australia" "Australia" ...
$ iso3c : chr [1:117] "AUS" "AUS" "AUS" "AUS" ...
$ year : num [1:117] 2005 2010 2015 2020 2022 ...
$ foreign_pop : num [1:117] NA NA NA NA NA ...
$ foreign : num [1:117] NA NA NA NA NA 9.4 10.5 13.4 NA NA ...
$ pop : num [1:117] 20.2 22 23.8 25.7 26 ...
$ FAM : num [1:117] 43747 60254 61085 44442 54294 ...
$ FAMWORKR : num [1:117] 40739 61377 68292 51899 48041 ...
$ FREEMOVS : num [1:117] 22379 29051 25081 11915 9257 ...
$ HUMNTRN : num [1:117] 17528 14553 13759 13171 13307 ...
$ OTH : num [1:117] 1772 501 238 81 199 ...
$ WORK : num [1:117] 41154 46491 59482 43944 41022 ...
$ TOT : num [1:117] 167319 212227 227937 165452 166120 ...
$ migrant : num [1:117] 4878030 5882980 6729730 7685860 NA ...
$ migrant_percent: num [1:117] 24.2 26.6 28.1 30.1 NA 13.7 15.2 17.1 19.3 NA ...
$ migrant_male : num [1:117] 24 26.4 28 30 NA 13.4 14.8 16.8 19.1 NA ...
$ migrant_female : num [1:117] 24.3 26.7 28.2 30.3 NA 13.9 15.5 17.3 19.5 NA ...
$ refugee : num [1:117] 64964 21805 35582 58607 53523 ...
$ asylum : num [1:117] 1822 3760 22837 78800 84950 ...
$ other : num [1:117] 8 15 NA NA 17252 ...
$ concern : num [1:117] 66794 25580 58419 137407 148076 ...
[Link]
Table in Excel, Filtered data in tableau text (CSV), Unfiltered data in tableau text (CSV) 136.4MB
temp <- read_csv("../../../bigdata/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+all.csv")
temp
library(readxl)
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx")
df_oecd_ed
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx", skip = 6)
df_oecd_ed
colnames(df_oecd_ed)
df_oecd_ed_short <- df_oecd_ed |> select(1,3,4,5,6,9,10) |> drop_na(`Primary education`)
df_oecd_ed_short
df_oecd_ed2 <- read_csv("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+..ISCED11_1+ISCED11_2+ISCED11_3+ISCED11_4+ISCED11_5+ISCED11_5T8+ISCED11_6T8+ISCED11_1T8._T.INST_EDU.DIR_EXP...csv")
df_oecd_ed2
Data Analysis for Researchers. [Link]
Posit Recipes(New edition of Posit Primers): interactive exercises [Link]
Cheat Sheet. [Site Link]
dplyr: A Grammar of Data Manipulation [Link]
Model Summary
r-statistics.co by Selva Prabhakaran:
Meaning Behind Each Section of Summary()
swirl: install.packages("swirl")
libray(swirl), and swirl() after
rm(list=ls())